import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as sbfrom plotly.subplots import make_subplotsimport plotly.graph_objects as goimport plotly.express as pxfrom pyod.models.pca import PCAfrom pyod.models.knn import KNNfrom warnings import filterwarningsfrom sklearn.model_selection import train_test_split filterwarnings('ignore')plt.rcParams['figure.figsize']=[18,12]xxxxxxxxxxdf=pd.read_csv('inpatientCharges.csv')df.head()xxxxxxxxxxdf.shapexxxxxxxxxxdf.columnsxxxxxxxxxxdf.dtypesxxxxxxxxxxdf.rename(columns={' Average Covered Charges ':'Average Covered Charges', ' Total Discharges ':'Total Discharges', ' Average Total Payments ':'Average Total Payments'},inplace=True)xxxxxxxxxxdf.columnsxxxxxxxxxxx=list(df.columns)[-3:]for i in x: df[i]=df[i].apply(lambda x: float(x.split('$')[1]))xxxxxxxxxxdf['Total Payment']=df['Average Total Payments']* df['Total Discharges']df['Medicare Payment amount']=df['Average Medicare Payments'] * df['Total Discharges']df['Total Covered Charge Amount']=df['Average Covered Charges'] * df['Total Discharges']xxxxxxxxxxdf.head()xxxxxxxxxxdf.dtypesxxxxxxxxxxdf.describe()xxxxxxxxxxdf.describe(include='object')xxxxxxxxxxduplicate_rows_df = df[df.duplicated()]print("number of duplicate rows: ", duplicate_rows_df.shape)xxxxxxxxxx# Null Valuetotal=df.isnull().sum().sort_values(ascending=False)percent=(df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)missed=pd.concat([total,percent],axis=1,keys=['total','percent'])missedxxxxxxxxxx#seggregating numerical and categorical columnsnum=list(df.select_dtypes(include='number').columns)cat=list(df.select_dtypes(include='object').columns)xxxxxxxxxxnumxxxxxxxxxxcatxxxxxxxxxxfig,ax=plt.subplots(nrows=2,ncols=3,figsize=[18,12])for i,j in zip(num,ax.flatten()): sb.boxplot(df[i],ax=j).set(xlabel=i)plt.show()xxxxxxxxxx# # Create a figure with 2x2 subplots# fig = make_subplots(rows=2, cols=3)# # Add a box plot to each subplot# fig.add_trace(go.Box(x=df[num[0]], name=num[0]),row=1,col=1)# fig.add_trace(go.Box(x=df[num[1]], name=num[1]),row=1,col=2)# fig.add_trace(go.Box(x=df[num[2]], name=num[2]),row=1,col=3)# fig.add_trace(go.Box(x=df[num[3]], name=num[3]),row=2,col=1)# fig.add_trace(go.Box(x=df[num[4]], name=num[4]),row=2,col=2)# fig.add_trace(go.Box(x=df[num[5]], name=num[5]),row=2,col=3)# fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),paper_bgcolor="LightSteelBlue",width=1000,height=600)# # Show the plot# fig.show()xxxxxxxxxxfig,ax=plt.subplots(nrows=2,ncols=3,figsize=[18,12])for i,j in zip(num,ax.flatten()): sb.histplot(df[i],ax=j,kde=True)plt.show()xxxxxxxxxx# fig = make_subplots(rows=2, cols=3)# # Add a box plot to each subplot# fig.add_trace(go.Histogram(x=df[num[0]], name=num[0]),row=1,col=1)# fig.add_trace(go.Histogram(x=df[num[1]], name=num[1]),row=1,col=2)# fig.add_trace(go.Histogram(x=df[num[2]], name=num[2]),row=1,col=3)# fig.add_trace(go.Histogram(x=df[num[3]], name=num[3]),row=2,col=1)# fig.add_trace(go.Histogram(x=df[num[4]], name=num[4]),row=2,col=2)# fig.add_trace(go.Histogram(x=df[num[5]], name=num[5]),row=2,col=3)# fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),paper_bgcolor="LightSteelBlue",width=1000,height=600)# # Show the plot# fig.show()xxxxxxxxxxsb.countplot(data=df,x='Provider State')plt.show()xxxxxxxxxxdf0=pd.pivot_table(data=df,index='Provider State',values='Total Discharges',aggfunc='mean').reset_index()sb.barplot(data=df0,x='Provider State',y='Total Discharges')plt.show()xxxxxxxxxxsb.scatterplot(data=df,x='Provider Zip Code',y='Average Total Payments',hue='Provider State')plt.show()xxxxxxxxxxsb.pairplot(data=df)Created a feature for Provider Region like dividing US states into four regions (Northeast, West, Midwest, South).This will useful for checking which region has more fraud.
xxxxxxxxxxstate_to_region = { 'CT': 'Northeast', 'ME': 'Northeast', 'MA': 'Northeast', 'NH': 'Northeast', 'RI': 'Northeast', 'VT': 'Northeast', 'NJ': 'Northeast', 'NY': 'Northeast', 'PA': 'Northeast', 'AK': 'West', 'CA': 'West', 'HI': 'West', 'NV': 'West', 'OR': 'West', 'WA': 'West','ID': 'West', 'MT': 'West', 'WY': 'West', 'UT': 'West', 'CO': 'West', 'NM': 'West','AZ': 'West', 'IL': 'Midwest', 'IN': 'Midwest', 'MI': 'Midwest', 'OH': 'Midwest', 'WI': 'Midwest','ND': 'Midwest', 'MN': 'Midwest', 'SD': 'Midwest', 'NE': 'Midwest', 'IA': 'Midwest','KS': 'Midwest', 'MO': 'Midwest', 'AL': 'South', 'AR': 'South','DC': 'South', 'DE': 'South', 'FL': 'South', 'GA': 'South', 'KY': 'South', 'LA': 'South', 'MD': 'South', 'MS': 'South', 'NC': 'South', 'OK': 'South', 'SC': 'South', 'TN': 'South', 'TX': 'South', 'VA': 'South', 'WV': 'South'}df['Provider Region'] = df['Provider State'].map(state_to_region)df.head()xxxxxxxxxxdf['Provider Region'].value_counts().plot.bar()From above charts we can see the data divided among the regions of USA and south is the region with most providers, while west has the least number of providers.
xxxxxxxxxxdf.shapeSummary: These feature represents the standardized score of the total number of discharges, Average Covered Charges, Average Total Payments, Average Medicare Payments by a provider region, DRG Definition, Hospital Referral Region, Provider City, Provider Name respectively.
Insight:
An unusually high or low number of discharges, could indicate irregularities in patient admissions or discharge practices. This could be a sign of fraudulent activities like patient churning or unnecessary admissions.
Extreme values in this features may indicate that a supplier charges much more or less than competitors in the same location. This might suggest service overcharging, service undercharging as part of a bribe fraudulent activity, or service fraud.
Total payments that are unusually high or low may indicate billing problems. High levels may indicate overutilization of services, while low values may indicate a lack or unreported services.
Significant differences in Medicare payments can be indicators of fraudulent activity, such as false claims or services not provided. Both abnormally high and unusually low results require additional investigation.
xxxxxxxxxx#Z-value for the Provider RegionList=df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']].groupby(df['Provider Region'])list1=List.transform('mean')list2=List.transform('std')data=(df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']] - list1)/list2data.columns = [f"{col}_z-value_PR" for col in data.columns]data.head()Discharges Total_z-value_PR: A high positive z-value shows that a provider has more discharges than the region's standard, which might imply significant patient turnover or over-admissions in comparison to competitors. A negative z-value indicates fewer discharges than predicted, which might be related to lower admission rates or stricter admission requirements.
Covered Charges Average_z-value_PR: Positive z-values show that a provider's charges are greater than the regional average, which could indicate overcharging or higher service costs. Negative numbers may indicate undercharging as a result of reduced operating expenses or alternative pricing methods.
Average Total Payments_z-value_PR: This measure compares the provider's total payments to the regional average. A greater z-value may indicate possible overutilization or overbilling of services, whilst a lower value may imply underbilling or more efficient service supply.
Medicare Average Payments_z-value_PR: Positive z-values indicate that the provider receives more from Medicare than the area average, which might be attributable to increased service utilisation or invoicing for more sophisticated treatments. Negative z-values may imply decreased reliance on Medicare, either as a result of a patient mix with less Medicare coverage or more cautious billing procedures.
xxxxxxxxxxlist=df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']].groupby(df['DRG Definition'])list1=list.transform('mean')list2=list.transform('std')data1=(df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']] - list1)/list2data1.columns = [f"{col}_z-value_DRG" for col in data1.columns]data1.head()Total Discharges_z-value_DRG: A high z-value, like the 2.73 for the first entry, suggests significantly higher patient discharges compared to others in the same DRG category, which might indicate high treatment frequency or potential overuse of services. Negative values, like -0.78, indicate lower than average discharges, which could mean more selective patient treatment or potentially insufficient care.
Average Covered Charges_z-value_DRG: Positive z-scores imply that a provider's charges are higher than the average for that DRG category, which could suggest overpricing or specialization in more complex procedures. Negative scores suggest lower-than-average charges, which could reflect more efficient care, lower cost structures, or potentially undercharging.
Average Total Payments_z-value_DRG: This metric reflects whether the provider's total payments are high or low relative to the DRG average. For example, a z-score of -1.03 indicates payments are lower than average, which may raise questions about the provider's service quality or billing practices.
Average Medicare Payments_z-value_DRG: These z-scores indicate how a provider's Medicare payments compare to the average for the DRG. Negative values, such as -1.15, could suggest a provider is receiving less Medicare reimbursement than peers, possibly due to lower service utilization or conservative billing.
xxxxxxxxxxlist=df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']].groupby(df['Hospital Referral Region Description'])list1=list.transform('mean')list2=list.transform('std')data2=(df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']] - list1)/list2data2.columns = [f"{col}_z-value_Reff" for col in data2.columns]data2.head()Total Discharges_z-value_Reff: A high z-score, like the 1.36 for the first entry, indicates that a hospital has a higher number of discharges compared to the average in its referral region. This could suggest a higher patient volume or a tendency to admit and discharge more frequently than other hospitals in the area.
Average Covered Charges_z-value_Reff: This measures how a hospital's average charges for services compare to the regional average. A positive z-score indicates higher than average charges, which might reflect a hospital's pricing strategy or suggest that it provides more specialized or intensive services.
Average Total Payments_z-value_Reff: A negative z-score here suggests that a hospital is receiving lower total payments than the regional average, possibly indicating efficiency in service delivery or challenges in obtaining reimbursements.
Average Medicare Payments_z-value_Reff: Negative z-scores suggest that a hospital's Medicare payments are less than the regional average. This could raise questions about the hospital's Medicare patient demographics, the mix of services provided to Medicare patients, or its Medicare billing practices.
xxxxxxxxxxlist=df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']].groupby(df['Provider City'])list1=list.transform('mean')list2=list.transform(np.std)data3=(df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']] - list1)/list2data3.columns = [f"{col}_z-value_ProvCity" for col in data3.columns]data3.head()Total Discharges_z-value_ProvCity: Positive z-scores, like 0.83, suggest that a provider has more discharges than the average for their city. This might indicate higher patient throughput. Negative z-scores, such as -0.82, show fewer discharges than the city average, potentially pointing to a smaller operational scale or more selective patient admission.
Average Covered Charges_z-value_ProvCity: The z-scores indicate whether a provider's charges are above or below the city average. Most values shown are negative, suggesting these providers generally charge less than the city average for their services.
Average Total Payments_z-value_ProvCity: These z-scores compare the total payments received to the city average. Most providers in the table have negative z-scores, implying they receive less in payments compared to the city average, which could be due to a variety of factors including the types of services provided, patient demographics, or efficiency in service delivery.
Average Medicare Payments_z-value_ProvCity: Similar to total payments, these z-scores compare Medicare payments against the city average. Negative values, such as -0.54, suggest a provider's Medicare reimbursements are below the city average, possibly indicating less reliance on Medicare billing, fewer services provided to Medicare patients, or more conservative billing practices.
xxxxxxxxxxlist=df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']].groupby(df['Provider Name'])list1=list.transform('mean')list2=list.transform(np.std)data4=(df[['Total Discharges','Average Covered Charges','Average Total Payments','Average Medicare Payments']] - list1)/list2data4.columns = [f"{col}_z-value_ProvName" for col in data4.columns]data4.head()Total Discharges_z-value_ProvName: Positive z-scores (like 0.60 for the first entry) suggest a provider has a higher number of discharges compared to the average for that provider name. This might suggest a higher patient volume or different operational practices. Negative z-scores indicate fewer discharges, which could be due to various factors such as smaller facility size or patient demographics.
Average Covered Charges_z-value_ProvName: This metric reflects the provider's pricing relative to the average for that provider name. Most values in the table are negative, which suggests these particular providers charge less for services than the average for their provider name group.
Average Total Payments_z-value_ProvName: Negative z-scores across the board suggest that these providers receive lower total payments than the average for their provider name group. This could indicate more efficient service provision, a different payer mix, or possibly an underutilization of services.
Average Medicare Payments_z-value_ProvName: Similar to total payments, these z-scores reflect how Medicare payments to the provider compare to the average for their provider name group. Negative scores, such as -0.53 for the third entry, could suggest lower reimbursement rates from Medicare, potentially due to a variety of billing practices or service offerings.
xxxxxxxxxxlist2.isnull().sum()xxxxxxxxxxdf=pd.concat([df,data],axis=1)df=pd.concat([df,data1],axis=1)df=pd.concat([df,data2],axis=1)df=pd.concat([df,data3],axis=1)df=pd.concat([df,data4],axis=1)xxxxxxxxxxdf1=df.iloc[:,16:]xxxxxxxxxxdf1.dropna(inplace=True)xxxxxxxxxxdf1.info()xxxxxxxxxxdf1.head()xxxxxxxxxxfig,ax=plt.subplots(nrows=5,ncols=4,figsize=[20,15])for i,j in zip(df1.columns,ax.flatten()): sb.histplot(df1[i],ax=j,kde=True)plt.show()xxxxxxxxxxsb.pairplot(data=df1)